# scripts/count_tokens.py
def count_token_ids(file_path):
    """Return the total number of whitespace-separated entries in a text file.

    The file at ``file_path`` is opened as UTF-8 text and consumed one line
    at a time. Every line contributes the number of whitespace-delimited
    fields it contains; blank or whitespace-only lines contribute zero.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        # split() without a separator ignores surrounding whitespace and
        # yields an empty list for blank lines, so no explicit strip or
        # emptiness check is required.
        return sum(len(row.split()) for row in handle)

# Default input file. Replace with your own path, or pass a path as the first
# command-line argument when running this script. A relative path is resolved
# against the current working directory.
file_path = '../data/pretrain_corpus_tokens_IDs'


def main(argv=None):
    """Count the entries in the input file and print the total.

    Args:
        argv: Optional list of command-line arguments (excluding the program
            name). When ``None``, ``sys.argv[1:]`` is used. The first
            argument, if present, overrides the default ``file_path``.
    """
    import sys

    args = sys.argv[1:] if argv is None else argv
    path = args[0] if args else file_path
    total_tokens = count_token_ids(path)
    print(f"Total number of token IDs: {total_tokens}")


# Only run when executed as a script, so importing this module to reuse
# count_token_ids does not read a file or print anything.
if __name__ == "__main__":
    main()